{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random\n",
    "import string"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def createList():\n",
    "    v1 = chr(random.randint(65,90))\n",
    "    v2 = chr(random.randint(97,122))\n",
    "    return random.choice([v1,v2])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 构造一个由100万个随机大小写字母组成的数据集"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# One million independent draws from createList(), kept as a plain Python list.\n",
    "arr = [createList() for _ in range(1_000_000)]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "['J', 'f', 'U', 'j', 'r', 'p', 'c', 'L', 'a', 'J', 'C', 'l', 'R', 'v', 'B', 'R', 'M', 'Z', 'a', 'E', 'Q', 'i', 'B', 'o', 'z', 'f', 'O', 'q', 'L', 'x', 'L', 'P', 'i', 'y', 'v', 'S', 'g', 'G', 'o', 'k', 'r', 's', 'n', 'v', 'H', 'K', 'j', 'A', 'Y', 'i', 'q', 't', 'j', 'h', 'T', 'G', 'S', 'O', 'y', 'R', 'Z', 'k', 'O', 'Q', 'o', 'z', 'x', 'x', 'd', 'O', 'f', 'L', 'S', 'E', 'm', 'o', 'w', 'v', 'l', 'J', 'E', 'W', 'y', 'p', 'D', 'X', 'I', 't', 'I', 'g', 'o', 't', 'V', 'A', 'd', 'G', 'B', 'D', 'c', 'q']\n"
     ]
    }
   ],
   "source": [
    "# Show the first 100 generated letters as a quick sanity check.\n",
    "print(arr[:100])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pyspark"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "# getOrCreate() reuses the running SparkContext when this cell is executed again;\n",
    "# calling SparkContext() a second time in the same kernel raises ValueError.\n",
    "sc = pyspark.SparkContext.getOrCreate()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Hand the local list to Spark so it becomes an RDD for the steps below.\n",
    "rdd = sc.parallelize(arr)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 对数据集进行转化：转化为(key,value)模式"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 其中，key是原来的字母，value可以是用来计算的内容，比如计数的话，就是1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Pair every letter with a count of 1 so the counts can be summed per key.\n",
    "maprdd = rdd.map(lambda letter: (letter, 1))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('J', 1), ('f', 1), ('U', 1), ('j', 1), ('r', 1), ('p', 1), ('c', 1), ('L', 1), ('a', 1), ('J', 1), ('C', 1), ('l', 1), ('R', 1), ('v', 1), ('B', 1), ('R', 1), ('M', 1), ('Z', 1), ('a', 1), ('E', 1), ('Q', 1), ('i', 1), ('B', 1), ('o', 1), ('z', 1), ('f', 1), ('O', 1), ('q', 1), ('L', 1), ('x', 1), ('L', 1), ('P', 1), ('i', 1), ('y', 1), ('v', 1), ('S', 1), ('g', 1), ('G', 1), ('o', 1), ('k', 1), ('r', 1), ('s', 1), ('n', 1), ('v', 1), ('H', 1), ('K', 1), ('j', 1), ('A', 1), ('Y', 1), ('i', 1), ('q', 1), ('t', 1), ('j', 1), ('h', 1), ('T', 1), ('G', 1), ('S', 1), ('O', 1), ('y', 1), ('R', 1), ('Z', 1), ('k', 1), ('O', 1), ('Q', 1), ('o', 1), ('z', 1), ('x', 1), ('x', 1), ('d', 1), ('O', 1), ('f', 1), ('L', 1), ('S', 1), ('E', 1), ('m', 1), ('o', 1), ('w', 1), ('v', 1), ('l', 1), ('J', 1), ('E', 1), ('W', 1), ('y', 1), ('p', 1), ('D', 1), ('X', 1), ('I', 1), ('t', 1), ('I', 1), ('g', 1), ('o', 1), ('t', 1), ('V', 1), ('A', 1), ('d', 1), ('G', 1), ('B', 1), ('D', 1), ('c', 1), ('q', 1)]\n"
     ]
    }
   ],
   "source": [
    "# take(100) fetches only the first 100 pairs, enough to eyeball the mapping.\n",
    "print(maprdd.take(100))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 使用 reduceByKey 统计每个字母出现的次数"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('J', 19127), ('r', 19367), ('l', 19431), ('i', 19512), ('O', 19144), ('y', 19344), ('g', 19303), ('s', 19238), ('N', 19116), ('a', 19133), ('B', 19380), ('z', 19407), ('H', 19347), ('m', 19286), ('e', 19219), ('j', 19178), ('C', 19070), ('R', 19145), ('W', 19192), ('Z', 19289), ('q', 19222), ('k', 19084), ('n', 19372), ('t', 18921), ('X', 19141), ('I', 19328), ('V', 19202), ('p', 19272), ('c', 19339), ('L', 19139), ('K', 19294), ('b', 19380), ('U', 19154), ('M', 19034), ('Q', 19093), ('x', 19158), ('A', 19231), ('T', 19290), ('w', 18995), ('F', 19353), ('S', 19072), ('h', 19322), ('d', 19214), ('f', 19206), ('v', 19338), ('E', 19272), ('o', 19261), ('P', 19297), ('G', 19216), ('Y', 19157), ('D', 19162), ('u', 19253)]\n"
     ]
    }
   ],
   "source": [
    "# Add up the values for each letter, then bring the totals back as a list.\n",
    "res = (\n",
    "    maprdd\n",
    "    .reduceByKey(lambda total, count: total + count)\n",
    "    .collect()\n",
    ")\n",
    "print(res)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### 也可以在 map 里加入条件判断，比如小写字母计数为1，大写字母计数为2"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "metadata": {},
   "outputs": [],
   "source": [
    "def myMap(w):\n",
    "    if ord(w) >=97:\n",
    "        return (w,1)\n",
    "    else:\n",
    "        return (w,2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "metadata": {},
   "outputs": [],
   "source": [
    "# myMap already takes a single element, so it can be handed to map() directly.\n",
    "maprdd2 = rdd.map(myMap)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('J', 2), ('f', 1), ('U', 2), ('j', 1), ('r', 1), ('p', 1), ('c', 1), ('L', 2), ('a', 1), ('J', 2), ('C', 2), ('l', 1), ('R', 2), ('v', 1), ('B', 2), ('R', 2), ('M', 2), ('Z', 2), ('a', 1), ('E', 2), ('Q', 2), ('i', 1), ('B', 2), ('o', 1), ('z', 1), ('f', 1), ('O', 2), ('q', 1), ('L', 2), ('x', 1), ('L', 2), ('P', 2), ('i', 1), ('y', 1), ('v', 1), ('S', 2), ('g', 1), ('G', 2), ('o', 1), ('k', 1), ('r', 1), ('s', 1), ('n', 1), ('v', 1), ('H', 2), ('K', 2), ('j', 1), ('A', 2), ('Y', 2), ('i', 1), ('q', 1), ('t', 1), ('j', 1), ('h', 1), ('T', 2), ('G', 2), ('S', 2), ('O', 2), ('y', 1), ('R', 2), ('Z', 2), ('k', 1), ('O', 2), ('Q', 2), ('o', 1), ('z', 1), ('x', 1), ('x', 1), ('d', 1), ('O', 2), ('f', 1), ('L', 2), ('S', 2), ('E', 2), ('m', 1), ('o', 1), ('w', 1), ('v', 1), ('l', 1), ('J', 2), ('E', 2), ('W', 2), ('y', 1), ('p', 1), ('D', 2), ('X', 2), ('I', 2), ('t', 1), ('I', 2), ('g', 1), ('o', 1), ('t', 1), ('V', 2), ('A', 2), ('d', 1), ('G', 2), ('B', 2), ('D', 2), ('c', 1), ('q', 1)]\n"
     ]
    }
   ],
   "source": [
    "# take(100) fetches only the first 100 weighted pairs for a quick look.\n",
    "print(maprdd2.take(100))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "[('J', 38254), ('r', 19367), ('l', 19431), ('i', 19512), ('O', 38288), ('y', 19344), ('g', 19303), ('s', 19238), ('N', 38232), ('a', 19133), ('B', 38760), ('z', 19407), ('H', 38694), ('m', 19286), ('e', 19219), ('j', 19178), ('C', 38140), ('R', 38290), ('W', 38384), ('Z', 38578), ('q', 19222), ('k', 19084), ('n', 19372), ('t', 18921), ('X', 38282), ('I', 38656), ('V', 38404), ('p', 19272), ('c', 19339), ('L', 38278), ('K', 38588), ('b', 19380), ('U', 38308), ('M', 38068), ('Q', 38186), ('x', 19158), ('A', 38462), ('T', 38580), ('w', 18995), ('F', 38706), ('S', 38144), ('h', 19322), ('d', 19214), ('f', 19206), ('v', 19338), ('E', 38544), ('o', 19261), ('P', 38594), ('G', 38432), ('Y', 38314), ('D', 38324), ('u', 19253)]\n"
     ]
    }
   ],
   "source": [
    "# Sum the weights for each letter, then bring the totals back as a list.\n",
    "res2 = (\n",
    "    maprdd2\n",
    "    .reduceByKey(lambda total, weight: total + weight)\n",
    "    .collect()\n",
    ")\n",
    "print(res2)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.6.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
